Import Libraries¶

In [156]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Dataset Overview¶

In [157]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
In [158]:
print(train.info())
print(train.describe())
print(train.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
None
                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ...   
min       1.000000  1872.000000   1950.000000     0.000000     0.000000  ...   
25%       5.000000  1954.000000   1967.000000     0.000000     0.000000  ...   
50%       5.000000  1973.000000   1994.000000     0.000000   383.500000  ...   
75%       6.000000  2000.000000   2004.000000   166.000000   712.250000  ...   
max       9.000000  2010.000000   2010.000000  1600.000000  5644.000000  ...   

        WoodDeckSF  OpenPorchSF  EnclosedPorch    3SsnPorch  ScreenPorch  \
count  1460.000000  1460.000000    1460.000000  1460.000000  1460.000000   
mean     94.244521    46.660274      21.954110     3.409589    15.060959   
std     125.338794    66.256028      61.119149    29.317331    55.757415   
min       0.000000     0.000000       0.000000     0.000000     0.000000   
25%       0.000000     0.000000       0.000000     0.000000     0.000000   
50%       0.000000    25.000000       0.000000     0.000000     0.000000   
75%     168.000000    68.000000       0.000000     0.000000     0.000000   
max     857.000000   547.000000     552.000000   508.000000   480.000000   

          PoolArea       MiscVal       MoSold       YrSold      SalePrice  
count  1460.000000   1460.000000  1460.000000  1460.000000    1460.000000  
mean      2.758904     43.489041     6.321918  2007.815753  180921.195890  
std      40.177307    496.123024     2.703626     1.328095   79442.502883  
min       0.000000      0.000000     1.000000  2006.000000   34900.000000  
25%       0.000000      0.000000     5.000000  2007.000000  129975.000000  
50%       0.000000      0.000000     6.000000  2008.000000  163000.000000  
75%       0.000000      0.000000     8.000000  2009.000000  214000.000000  
max     738.000000  15500.000000    12.000000  2010.000000  755000.000000  

[8 rows x 38 columns]
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD         Normal     208500  
1   2007        WD         Normal     181500  
2   2008        WD         Normal     223500  
3   2006        WD        Abnorml     140000  
4   2008        WD         Normal     250000  

[5 rows x 81 columns]

Missing Data¶

In [159]:
missing_data = train.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
missing_features = train.columns[train.isnull().any()].tolist()
PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64

Visualize Price Distribution¶

In [160]:
# Plot SalePrice distribution (original)
sns.set(style="whitegrid")
plt.figure(figsize=(10, 5))
sns.histplot(train['SalePrice'], kde=True, bins=40, color='skyblue')
plt.title('Sale Price Distribution (Original Scale)', fontsize=14)
plt.xlabel('Sale Price')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Plot SalePrice distribution (log-transformed)
plt.figure(figsize=(10, 5))
sns.histplot(np.log1p(train['SalePrice']), kde=True, bins=40, color='salmon')
plt.title('Sale Price Distribution (Log-Transformed)', fontsize=14)
plt.xlabel('Log(Sale Price)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image

Visualize Features vs Target¶

In [161]:
all_features = [col for col in train.columns if col not in ['Id', 'SalePrice']]

for feature in all_features:
    plt.figure(figsize=(10, 6))
    
    if train[feature].dtype == 'object':  # Categorical features
        sns.boxplot(data=train, x=feature, y='SalePrice')
        plt.title(f'Boxplot of SalePrice by {feature}')
        plt.xticks(rotation=45)
    else:  # Numerical features
        sns.scatterplot(data=train, x=feature, y='SalePrice', alpha=0.6)
        plt.title(f'Scatterplot of SalePrice vs {feature}')
    
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.tight_layout()
    
    save_path = os.path.join("..\\visualizations", f"{feature}_SalePrice_plot.png")
    plt.savefig(save_path, bbox_inches='tight')
    plt.show()
    plt.close()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Visualize Feature Correlation Heatmap¶

In [162]:
# Only numeric columns
numeric_df = train.select_dtypes(include=[np.number])

# Compute correlation matrix
corr_matrix = numeric_df.corr()

# Sort features with highest absolute correlation to SalePrice
target_corr = corr_matrix['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)

# Select top 20 most correlated features with SalePrice
top_features = target_corr.head(20).index.tolist() + ['SalePrice']

# Plot heatmap
plt.figure(figsize=(14, 10))
sns.set(font_scale=1.1)  # Slightly increase font size
sns.heatmap(train[top_features].corr(), annot=True, fmt=".2f", cmap="coolwarm",
            cbar_kws={"label": "Correlation Coefficient"}, square=True)

plt.title("Top Feature Correlations with SalePrice", fontsize=16)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
No description has been provided for this image

Data Cleaning¶

In [163]:
# PoolQC
train['PoolQC'].fillna('NoPool', inplace=True)
pool_map = {'NoPool': 0, 'Fa': 1, 'Gd': 2, 'Ex': 3}
train['PoolQC'] = train['PoolQC'].map(pool_map)
In [164]:
# MisFeature
train['MiscFeature'].fillna('None', inplace=True)
misc_map = {'None': 0, 'Shed': 1, 'Gar2': 2, 'Othr': 3, 'TenC': 4}
train['MiscFeature'] = train['MiscFeature'].map(misc_map)
In [165]:
# Alley
train['Alley'].fillna('NoAlley', inplace=True)
alley_map = {'NoAlley': 0, 'Grvl': 1, 'Pave': 2}
train['Alley'] = train['Alley'].map(alley_map)
In [166]:
# Fence
train['Fence'].fillna('NoFence', inplace=True)
fence_map = {'NoFence': 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4}
train['Fence'] = train['Fence'].map(fence_map)
In [167]:
# FireplaceQu
train['FireplaceQu'].fillna('NoFireplace', inplace=True)
fireplace_map = {'NoFireplace': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['FireplaceQu'] = train['FireplaceQu'].map(fireplace_map)
In [168]:
# LotFrontage
train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
In [169]:
# GarageYrBlt GarageFinish, GarageQual, GarageCond
train['GarageYrBlt'].fillna(0, inplace=True)
garage_categorical_features = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garage_categorical_features:
    train[col].fillna('NoGarage', inplace=True)

garage_type_map = {'NoGarage': 0, 'Attchd': 1, 'Detchd': 2, 'BuiltIn': 3, 'Basment': 4, 'CarPort': 5, '2Types': 6}
train['GarageType'] = train['GarageType'].map(garage_type_map)

garage_finish_map = {'NoGarage': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
train['GarageFinish'] = train['GarageFinish'].map(garage_finish_map)

garage_qual_cond_map = {'NoGarage': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['GarageQual'] = train['GarageQual'].map(garage_qual_cond_map)
train['GarageCond'] = train['GarageCond'].map(garage_qual_cond_map)
In [170]:
# BsmtFinType2, BsmtExposure, BsmtQual, BsmtCond, BsmtFinType1
bsmt_categorical_features = ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond']
for col in bsmt_categorical_features:
    train[col].fillna('NoBasement', inplace=True)

bsmt_numerical_features = ['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath']
for col in bsmt_numerical_features:
    if col in train.columns:
        train[col].fillna(0, inplace=True)

bsmt_fin_type_map = {'NoBasement': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
train['BsmtFinType1'] = train['BsmtFinType1'].map(bsmt_fin_type_map)
train['BsmtFinType2'] = train['BsmtFinType2'].map(bsmt_fin_type_map)

bsmt_exposure_map = {'NoBasement': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
train['BsmtExposure'] = train['BsmtExposure'].map(bsmt_exposure_map)

bsmt_qual_cond_map = {'NoBasement': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
train['BsmtQual'] = train['BsmtQual'].map(bsmt_qual_cond_map)
train['BsmtCond'] = train['BsmtCond'].map(bsmt_qual_cond_map)
In [171]:
# MasVnrType, MasVnrArea
train['MasVnrType'].fillna('None', inplace=True)
train['MasVnrArea'].fillna(0, inplace=True)

masvnr_type_map = {'None': 0, 'BrkCmn': 1, 'BrkFace': 2, 'Stone': 3}
train['MasVnrType'] = train['MasVnrType'].map(masvnr_type_map)
In [172]:
# Electrical
train['Electrical'].fillna('SBrkr', inplace=True)
electrical_map = {'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4}
train['Electrical'] = train['Electrical'].map(electrical_map)
In [173]:
missing_data = train.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
Series([], dtype: int64)
In [174]:
categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
non_categorical_columns = train.select_dtypes(exclude=['object']).columns.tolist()

print("Categorical Columns:")
print(categorical_columns)

print("\nNon-Categorical Columns:")
print(non_categorical_columns)
Categorical Columns:
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

Non-Categorical Columns:
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Alley', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
In [175]:
ordinal_features = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 
                    'HeatingQC', 'KitchenQual', 'Functional', 'PavedDrive', 'CentralAir']
ordinal_mappings = {
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'Utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
    'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1}
}


for feature, mapping in ordinal_mappings.items():
    train[feature] = train[feature].map(mapping)

one_hot_features = [col for col in categorical_columns if col not in ordinal_features]

train = pd.get_dummies(train, columns=one_hot_features, drop_first=True)
In [176]:
missing_data = train.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
Series([], dtype: int64)
In [177]:
print(train.info())
print(train.describe())
print(train.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 190 entries, Id to SaleCondition_Partial
dtypes: float64(3), int64(61), uint8(126)
memory usage: 909.8 KB
None
                Id   MSSubClass  LotFrontage        LotArea        Alley  \
count  1460.000000  1460.000000  1460.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.199658   10516.828082     0.090411   
std     421.610009    42.300571    22.431902    9981.264932     0.372151   
min       1.000000    20.000000    21.000000    1300.000000     0.000000   
25%     365.750000    20.000000    60.000000    7553.500000     0.000000   
50%     730.500000    50.000000    70.000000    9478.500000     0.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     0.000000   
max    1460.000000   190.000000   313.000000  215245.000000     2.000000   

          LotShape    Utilities    LandSlope  OverallQual  OverallCond  ...  \
count  1460.000000  1460.000000  1460.000000  1460.000000  1460.000000  ...   
mean      2.591781     2.998630     1.937671     6.099315     5.575342  ...   
std       0.582296     0.052342     0.276232     1.382997     1.112799  ...   
min       0.000000     1.000000     0.000000     1.000000     1.000000  ...   
25%       2.000000     3.000000     2.000000     5.000000     5.000000  ...   
50%       3.000000     3.000000     2.000000     6.000000     5.000000  ...   
75%       3.000000     3.000000     2.000000     7.000000     6.000000  ...   
max       3.000000     3.000000     2.000000    10.000000     9.000000  ...   

       SaleType_ConLI  SaleType_ConLw  SaleType_New  SaleType_Oth  \
count     1460.000000     1460.000000   1460.000000   1460.000000   
mean         0.003425        0.003425      0.083562      0.002055   
std          0.058440        0.058440      0.276824      0.045299   
min          0.000000        0.000000      0.000000      0.000000   
25%          0.000000        0.000000      0.000000      0.000000   
50%          0.000000        0.000000      0.000000      0.000000   
75%          0.000000        0.000000      0.000000      0.000000   
max          1.000000        1.000000      1.000000      1.000000   

       SaleType_WD  SaleCondition_AdjLand  SaleCondition_Alloca  \
count  1460.000000            1460.000000           1460.000000   
mean      0.867808               0.002740              0.008219   
std       0.338815               0.052289              0.090317   
min       0.000000               0.000000              0.000000   
25%       1.000000               0.000000              0.000000   
50%       1.000000               0.000000              0.000000   
75%       1.000000               0.000000              0.000000   
max       1.000000               1.000000              1.000000   

       SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  
count           1460.000000           1460.000000            1460.000000  
mean               0.013699              0.820548               0.085616  
std                0.116277              0.383862               0.279893  
min                0.000000              0.000000               0.000000  
25%                0.000000              1.000000               0.000000  
50%                0.000000              1.000000               0.000000  
75%                0.000000              1.000000               0.000000  
max                1.000000              1.000000               1.000000  

[8 rows x 190 columns]
   Id  MSSubClass  LotFrontage  LotArea  Alley  LotShape  Utilities  \
0   1          60         65.0     8450      0         3          3   
1   2          20         80.0     9600      0         3          3   
2   3          60         68.0    11250      0         2          3   
3   4          70         60.0     9550      0         2          3   
4   5          60         84.0    14260      0         2          3   

   LandSlope  OverallQual  OverallCond  ...  SaleType_ConLI  SaleType_ConLw  \
0          2            7            5  ...               0               0   
1          2            6            8  ...               0               0   
2          2            7            5  ...               0               0   
3          2            7            5  ...               0               0   
4          2            8            5  ...               0               0   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_AdjLand  \
0             0             0            1                      0   
1             0             0            1                      0   
2             0             0            1                      0   
3             0             0            1                      0   
4             0             0            1                      0   

   SaleCondition_Alloca  SaleCondition_Family  SaleCondition_Normal  \
0                     0                     0                     1   
1                     0                     0                     1   
2                     0                     0                     1   
3                     0                     0                     0   
4                     0                     0                     1   

   SaleCondition_Partial  
0                      0  
1                      0  
2                      0  
3                      0  
4                      0  

[5 rows x 190 columns]

Save Clean Train Data¶

In [178]:
train.to_csv("../data/clean_train.csv", index=False)

print("Cleaned train dataset saved as clean_train.csv.")
Cleaned train dataset saved as clean_train.csv.

Test Data Overview¶

In [179]:
print(test.info())
print(test.describe())
print(test.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallCond    1459 non-null   int64  
 19  YearBuilt      1459 non-null   int64  
 20  YearRemodAdd   1459 non-null   int64  
 21  RoofStyle      1459 non-null   object 
 22  RoofMatl       1459 non-null   object 
 23  Exterior1st    1458 non-null   object 
 24  Exterior2nd    1458 non-null   object 
 25  MasVnrType     1443 non-null   object 
 26  MasVnrArea     1444 non-null   float64
 27  ExterQual      1459 non-null   object 
 28  ExterCond      1459 non-null   object 
 29  Foundation     1459 non-null   object 
 30  BsmtQual       1415 non-null   object 
 31  BsmtCond       1414 non-null   object 
 32  BsmtExposure   1415 non-null   object 
 33  BsmtFinType1   1417 non-null   object 
 34  BsmtFinSF1     1458 non-null   float64
 35  BsmtFinType2   1417 non-null   object 
 36  BsmtFinSF2     1458 non-null   float64
 37  BsmtUnfSF      1458 non-null   float64
 38  TotalBsmtSF    1458 non-null   float64
 39  Heating        1459 non-null   object 
 40  HeatingQC      1459 non-null   object 
 41  CentralAir     1459 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1459 non-null   int64  
 44  2ndFlrSF       1459 non-null   int64  
 45  LowQualFinSF   1459 non-null   int64  
 46  GrLivArea      1459 non-null   int64  
 47  BsmtFullBath   1457 non-null   float64
 48  BsmtHalfBath   1457 non-null   float64
 49  FullBath       1459 non-null   int64  
 50  HalfBath       1459 non-null   int64  
 51  BedroomAbvGr   1459 non-null   int64  
 52  KitchenAbvGr   1459 non-null   int64  
 53  KitchenQual    1458 non-null   object 
 54  TotRmsAbvGrd   1459 non-null   int64  
 55  Functional     1457 non-null   object 
 56  Fireplaces     1459 non-null   int64  
 57  FireplaceQu    729 non-null    object 
 58  GarageType     1383 non-null   object 
 59  GarageYrBlt    1381 non-null   float64
 60  GarageFinish   1381 non-null   object 
 61  GarageCars     1458 non-null   float64
 62  GarageArea     1458 non-null   float64
 63  GarageQual     1381 non-null   object 
 64  GarageCond     1381 non-null   object 
 65  PavedDrive     1459 non-null   object 
 66  WoodDeckSF     1459 non-null   int64  
 67  OpenPorchSF    1459 non-null   int64  
 68  EnclosedPorch  1459 non-null   int64  
 69  3SsnPorch      1459 non-null   int64  
 70  ScreenPorch    1459 non-null   int64  
 71  PoolArea       1459 non-null   int64  
 72  PoolQC         3 non-null      object 
 73  Fence          290 non-null    object 
 74  MiscFeature    51 non-null     object 
 75  MiscVal        1459 non-null   int64  
 76  MoSold         1459 non-null   int64  
 77  YrSold         1459 non-null   int64  
 78  SaleType       1458 non-null   object 
 79  SaleCondition  1459 non-null   object 
dtypes: float64(11), int64(26), object(43)
memory usage: 912.0+ KB
None
                Id   MSSubClass  LotFrontage       LotArea  OverallQual  \
count  1459.000000  1459.000000  1232.000000   1459.000000  1459.000000   
mean   2190.000000    57.378341    68.580357   9819.161069     6.078821   
std     421.321334    42.746880    22.376841   4955.517327     1.436812   
min    1461.000000    20.000000    21.000000   1470.000000     1.000000   
25%    1825.500000    20.000000    58.000000   7391.000000     5.000000   
50%    2190.000000    50.000000    67.000000   9399.000000     6.000000   
75%    2554.500000    70.000000    80.000000  11517.500000     7.000000   
max    2919.000000   190.000000   200.000000  56600.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1459.000000  1459.000000   1459.000000  1444.000000  1458.000000  ...   
mean      5.553804  1971.357779   1983.662783   100.709141   439.203704  ...   
std       1.113740    30.390071     21.130467   177.625900   455.268042  ...   
min       1.000000  1879.000000   1950.000000     0.000000     0.000000  ...   
25%       5.000000  1953.000000   1963.000000     0.000000     0.000000  ...   
50%       5.000000  1973.000000   1992.000000     0.000000   350.500000  ...   
75%       6.000000  2001.000000   2004.000000   164.000000   753.500000  ...   
max       9.000000  2010.000000   2010.000000  1290.000000  4010.000000  ...   

        GarageArea   WoodDeckSF  OpenPorchSF  EnclosedPorch    3SsnPorch  \
count  1458.000000  1459.000000  1459.000000    1459.000000  1459.000000   
mean    472.768861    93.174777    48.313914      24.243317     1.794380   
std     217.048611   127.744882    68.883364      67.227765    20.207842   
min       0.000000     0.000000     0.000000       0.000000     0.000000   
25%     318.000000     0.000000     0.000000       0.000000     0.000000   
50%     480.000000     0.000000    28.000000       0.000000     0.000000   
75%     576.000000   168.000000    72.000000       0.000000     0.000000   
max    1488.000000  1424.000000   742.000000    1012.000000   360.000000   

       ScreenPorch     PoolArea       MiscVal       MoSold       YrSold  
count  1459.000000  1459.000000   1459.000000  1459.000000  1459.000000  
mean     17.064428     1.744345     58.167923     6.104181  2007.769705  
std      56.609763    30.491646    630.806978     2.722432     1.301740  
min       0.000000     0.000000      0.000000     1.000000  2006.000000  
25%       0.000000     0.000000      0.000000     4.000000  2007.000000  
50%       0.000000     0.000000      0.000000     6.000000  2008.000000  
75%       0.000000     0.000000      0.000000     8.000000  2009.000000  
max     576.000000   800.000000  17000.000000    12.000000  2010.000000  

[8 rows x 37 columns]
     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   
4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  
0       0      6    2010        WD         Normal  
1   12500      6    2010        WD         Normal  
2       0      3    2010        WD         Normal  
3       0      6    2010        WD         Normal  
4       0      1    2010        WD         Normal  

[5 rows x 80 columns]
In [180]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageQual        78
GarageFinish      78
GarageCond        78
GarageType        76
BsmtCond          45
BsmtQual          44
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
MasVnrType        16
MasVnrArea        15
MSZoning           4
BsmtHalfBath       2
Utilities          2
Functional         2
BsmtFullBath       2
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
KitchenQual        1
TotalBsmtSF        1
Exterior2nd        1
GarageCars         1
Exterior1st        1
GarageArea         1
SaleType           1
dtype: int64

Data Cleaning¶

In [181]:
# PoolQC
test['PoolQC'].fillna('NoPool', inplace=True)
pool_map = {'NoPool': 0, 'Fa': 1, 'Gd': 2, 'Ex': 3}
test['PoolQC'] = test['PoolQC'].map(pool_map)
In [182]:
# MisFeature
test['MiscFeature'].fillna('None', inplace=True)
misc_map = {'None': 0, 'Shed': 1, 'Gar2': 2, 'Othr': 3, 'TenC': 4}
test['MiscFeature'] = test['MiscFeature'].map(misc_map)
In [183]:
# Alley
test['Alley'].fillna('NoAlley', inplace=True)
alley_map = {'NoAlley': 0, 'Grvl': 1, 'Pave': 2}
test['Alley'] = test['Alley'].map(alley_map)
In [184]:
# Fence
test['Fence'].fillna('NoFence', inplace=True)
fence_map = {'NoFence': 0, 'MnWw': 1, 'MnPrv': 2, 'GdWo': 3, 'GdPrv': 4}
test['Fence'] = test['Fence'].map(fence_map)
In [185]:
# FireplaceQu
test['FireplaceQu'].fillna('NoFireplace', inplace=True)
fireplace_map = {'NoFireplace': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
test['FireplaceQu'] = test['FireplaceQu'].map(fireplace_map)
In [186]:
# LotFrontage
test['LotFrontage'] = test.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
In [187]:
# GarageYrBlt GarageFinish, GarageQual, GarageCond
test['GarageYrBlt'].fillna(0, inplace=True)
garage_categorical_features = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garage_categorical_features:
    test[col].fillna('NoGarage', inplace=True)

garage_type_map = {'NoGarage': 0, 'Attchd': 1, 'Detchd': 2, 'BuiltIn': 3, 'Basment': 4, 'CarPort': 5, '2Types': 6}
test['GarageType'] = test['GarageType'].map(garage_type_map)

garage_finish_map = {'NoGarage': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}
test['GarageFinish'] = test['GarageFinish'].map(garage_finish_map)

garage_qual_cond_map = {'NoGarage': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
test['GarageQual'] = test['GarageQual'].map(garage_qual_cond_map)
test['GarageCond'] = test['GarageCond'].map(garage_qual_cond_map)
In [188]:
# BsmtFinType2, BsmtExposure, BsmtQual, BsmtCond, BsmtFinType1
bsmt_categorical_features = ['BsmtFinType1', 'BsmtFinType2', 'BsmtExposure', 'BsmtQual', 'BsmtCond']
for col in bsmt_categorical_features:
    test[col].fillna('NoBasement', inplace=True)

bsmt_numerical_features = ['TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath', 'BsmtHalfBath']
for col in bsmt_numerical_features:
    if col in test.columns:
        test[col].fillna(0, inplace=True)

bsmt_fin_type_map = {'NoBasement': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
test['BsmtFinType1'] = test['BsmtFinType1'].map(bsmt_fin_type_map)
test['BsmtFinType2'] = test['BsmtFinType2'].map(bsmt_fin_type_map)

bsmt_exposure_map = {'NoBasement': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
test['BsmtExposure'] = test['BsmtExposure'].map(bsmt_exposure_map)

bsmt_qual_cond_map = {'NoBasement': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
test['BsmtQual'] = test['BsmtQual'].map(bsmt_qual_cond_map)
test['BsmtCond'] = test['BsmtCond'].map(bsmt_qual_cond_map)
In [189]:
# MasVnrType, MasVnrArea
test['MasVnrType'].fillna('None', inplace=True)
test['MasVnrArea'].fillna(0, inplace=True)

masvnr_type_map = {'None': 0, 'BrkCmn': 1, 'BrkFace': 2, 'Stone': 3}
test['MasVnrType'] = test['MasVnrType'].map(masvnr_type_map)
In [190]:
# Electrical
test['Electrical'].fillna('SBrkr', inplace=True)
electrical_map = {'SBrkr': 0, 'FuseA': 1, 'FuseF': 2, 'FuseP': 3, 'Mix': 4}
test['Electrical'] = test['Electrical'].map(electrical_map)
In [191]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
MSZoning       4
Utilities      2
Functional     2
GarageCars     1
GarageArea     1
KitchenQual    1
Exterior1st    1
Exterior2nd    1
SaleType       1
dtype: int64
In [192]:
train1 = pd.read_csv('../data/train.csv')
test['Utilities'].fillna(train1['Utilities'].mode()[0], inplace=True)
test['Functional'].fillna(train1['Functional'].mode()[0], inplace=True)
test['GarageCars'].fillna(0, inplace=True)
test['GarageArea'].fillna(0, inplace=True)
test['KitchenQual'].fillna(train1['KitchenQual'].mode()[0], inplace=True)
test['MSZoning'].fillna(train1['MSZoning'].mode()[0], inplace=True)
test['Exterior1st'].fillna(train1['Exterior1st'].mode()[0], inplace=True)
test['Exterior2nd'].fillna(train1['Exterior2nd'].mode()[0], inplace=True)
test['SaleType'].fillna(train1['SaleType'].mode()[0], inplace=True)
In [193]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotFrontage    1459 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          1459 non-null   int64  
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1459 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallCond    1459 non-null   int64  
 19  YearBuilt      1459 non-null   int64  
 20  YearRemodAdd   1459 non-null   int64  
 21  RoofStyle      1459 non-null   object 
 22  RoofMatl       1459 non-null   object 
 23  Exterior1st    1459 non-null   object 
 24  Exterior2nd    1459 non-null   object 
 25  MasVnrType     1459 non-null   int64  
 26  MasVnrArea     1459 non-null   float64
 27  ExterQual      1459 non-null   object 
 28  ExterCond      1459 non-null   object 
 29  Foundation     1459 non-null   object 
 30  BsmtQual       1459 non-null   int64  
 31  BsmtCond       1459 non-null   int64  
 32  BsmtExposure   1459 non-null   int64  
 33  BsmtFinType1   1459 non-null   int64  
 34  BsmtFinSF1     1459 non-null   float64
 35  BsmtFinType2   1459 non-null   int64  
 36  BsmtFinSF2     1459 non-null   float64
 37  BsmtUnfSF      1459 non-null   float64
 38  TotalBsmtSF    1459 non-null   float64
 39  Heating        1459 non-null   object 
 40  HeatingQC      1459 non-null   object 
 41  CentralAir     1459 non-null   object 
 42  Electrical     1459 non-null   int64  
 43  1stFlrSF       1459 non-null   int64  
 44  2ndFlrSF       1459 non-null   int64  
 45  LowQualFinSF   1459 non-null   int64  
 46  GrLivArea      1459 non-null   int64  
 47  BsmtFullBath   1459 non-null   float64
 48  BsmtHalfBath   1459 non-null   float64
 49  FullBath       1459 non-null   int64  
 50  HalfBath       1459 non-null   int64  
 51  BedroomAbvGr   1459 non-null   int64  
 52  KitchenAbvGr   1459 non-null   int64  
 53  KitchenQual    1459 non-null   object 
 54  TotRmsAbvGrd   1459 non-null   int64  
 55  Functional     1459 non-null   object 
 56  Fireplaces     1459 non-null   int64  
 57  FireplaceQu    1459 non-null   int64  
 58  GarageType     1459 non-null   int64  
 59  GarageYrBlt    1459 non-null   float64
 60  GarageFinish   1459 non-null   int64  
 61  GarageCars     1459 non-null   float64
 62  GarageArea     1459 non-null   float64
 63  GarageQual     1459 non-null   int64  
 64  GarageCond     1459 non-null   int64  
 65  PavedDrive     1459 non-null   object 
 66  WoodDeckSF     1459 non-null   int64  
 67  OpenPorchSF    1459 non-null   int64  
 68  EnclosedPorch  1459 non-null   int64  
 69  3SsnPorch      1459 non-null   int64  
 70  ScreenPorch    1459 non-null   int64  
 71  PoolArea       1459 non-null   int64  
 72  PoolQC         1459 non-null   int64  
 73  Fence          1459 non-null   int64  
 74  MiscFeature    1459 non-null   int64  
 75  MiscVal        1459 non-null   int64  
 76  MoSold         1459 non-null   int64  
 77  YrSold         1459 non-null   int64  
 78  SaleType       1459 non-null   object 
 79  SaleCondition  1459 non-null   object 
dtypes: float64(11), int64(42), object(27)
memory usage: 912.0+ KB
None
In [194]:
categorical_columns = test.select_dtypes(include=['object']).columns.tolist()
non_categorical_columns = test.select_dtypes(exclude=['object']).columns.tolist()

print("Categorical Columns:")
print(categorical_columns)

print("\nNon-Categorical Columns:")
print(non_categorical_columns)
Categorical Columns:
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

Non-Categorical Columns:
['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'Alley', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold']
In [195]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotFrontage    1459 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          1459 non-null   int64  
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1459 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallCond    1459 non-null   int64  
 19  YearBuilt      1459 non-null   int64  
 20  YearRemodAdd   1459 non-null   int64  
 21  RoofStyle      1459 non-null   object 
 22  RoofMatl       1459 non-null   object 
 23  Exterior1st    1459 non-null   object 
 24  Exterior2nd    1459 non-null   object 
 25  MasVnrType     1459 non-null   int64  
 26  MasVnrArea     1459 non-null   float64
 27  ExterQual      1459 non-null   object 
 28  ExterCond      1459 non-null   object 
 29  Foundation     1459 non-null   object 
 30  BsmtQual       1459 non-null   int64  
 31  BsmtCond       1459 non-null   int64  
 32  BsmtExposure   1459 non-null   int64  
 33  BsmtFinType1   1459 non-null   int64  
 34  BsmtFinSF1     1459 non-null   float64
 35  BsmtFinType2   1459 non-null   int64  
 36  BsmtFinSF2     1459 non-null   float64
 37  BsmtUnfSF      1459 non-null   float64
 38  TotalBsmtSF    1459 non-null   float64
 39  Heating        1459 non-null   object 
 40  HeatingQC      1459 non-null   object 
 41  CentralAir     1459 non-null   object 
 42  Electrical     1459 non-null   int64  
 43  1stFlrSF       1459 non-null   int64  
 44  2ndFlrSF       1459 non-null   int64  
 45  LowQualFinSF   1459 non-null   int64  
 46  GrLivArea      1459 non-null   int64  
 47  BsmtFullBath   1459 non-null   float64
 48  BsmtHalfBath   1459 non-null   float64
 49  FullBath       1459 non-null   int64  
 50  HalfBath       1459 non-null   int64  
 51  BedroomAbvGr   1459 non-null   int64  
 52  KitchenAbvGr   1459 non-null   int64  
 53  KitchenQual    1459 non-null   object 
 54  TotRmsAbvGrd   1459 non-null   int64  
 55  Functional     1459 non-null   object 
 56  Fireplaces     1459 non-null   int64  
 57  FireplaceQu    1459 non-null   int64  
 58  GarageType     1459 non-null   int64  
 59  GarageYrBlt    1459 non-null   float64
 60  GarageFinish   1459 non-null   int64  
 61  GarageCars     1459 non-null   float64
 62  GarageArea     1459 non-null   float64
 63  GarageQual     1459 non-null   int64  
 64  GarageCond     1459 non-null   int64  
 65  PavedDrive     1459 non-null   object 
 66  WoodDeckSF     1459 non-null   int64  
 67  OpenPorchSF    1459 non-null   int64  
 68  EnclosedPorch  1459 non-null   int64  
 69  3SsnPorch      1459 non-null   int64  
 70  ScreenPorch    1459 non-null   int64  
 71  PoolArea       1459 non-null   int64  
 72  PoolQC         1459 non-null   int64  
 73  Fence          1459 non-null   int64  
 74  MiscFeature    1459 non-null   int64  
 75  MiscVal        1459 non-null   int64  
 76  MoSold         1459 non-null   int64  
 77  YrSold         1459 non-null   int64  
 78  SaleType       1459 non-null   object 
 79  SaleCondition  1459 non-null   object 
dtypes: float64(11), int64(42), object(27)
memory usage: 912.0+ KB
None
{'Exterior1st_AsphShn', 'MSZoning_RL', 'Exterior2nd_ImStucc', 'Condition1_RRAe', 'Condition1_RRNe', 'Neighborhood_SawyerW', 'MSZoning_RM', 'Condition1_Norm', 'HouseStyle_2.5Unf', 'MSZoning_RH', 'SaleType_New', 'SaleCondition_Normal', 'Neighborhood_NWAmes', 'Exterior1st_BrkComm', 'Neighborhood_BrkSide', 'Condition1_Feedr', 'Neighborhood_NPkVill', 'RoofMatl_CompShg', 'SaleType_Con', 'Exterior2nd_MetalSd', 'Foundation_CBlock', 'Foundation_Stone', 'Condition2_RRAe', 'Neighborhood_IDOTRR', 'Exterior1st_Plywood', 'Neighborhood_NridgHt', 'Exterior2nd_CBlock', 'MSZoning_FV', 'SaleCondition_AdjLand', 'Neighborhood_NoRidge', 'HouseStyle_1Story', 'Condition2_PosN', 'Neighborhood_Somerst', 'SaleCondition_Alloca', 'Exterior2nd_Plywood', 'Exterior1st_WdShing', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_StoneBr', 'RoofMatl_Membran', 'Condition2_RRAn', 'Exterior1st_CBlock', 'Exterior1st_Wd Sdng', 'SaleType_Oth', 'Neighborhood_NAmes', 'RoofMatl_WdShngl', 'Exterior2nd_Stone', 'BldgType_2fmCon', 'Foundation_Wood', 'Foundation_PConc', 'LotConfig_CulDSac', 'LandContour_Low', 'Exterior2nd_HdBoard', 'Exterior2nd_VinylSd', 'Exterior2nd_Brk Cmn', 'Foundation_Slab', 'Exterior2nd_Wd Sdng', 'Street_Pave', 'SalePrice', 'HouseStyle_SLvl', 'RoofMatl_WdShake', 'SaleType_WD', 'HouseStyle_1.5Unf', 'Condition2_RRNn', 'Exterior1st_CemntBd', 'Neighborhood_Gilbert', 'Condition2_PosA', 'LotConfig_FR2', 'Condition2_Norm', 'BldgType_TwnhsE', 'HouseStyle_SFoyer', 'BldgType_Twnhs', 'SaleCondition_Partial', 'Exterior1st_MetalSd', 'SaleType_ConLw', 'Condition1_PosN', 'Neighborhood_CollgCr', 'Exterior2nd_Wd Shng', 'LotConfig_Inside', 'LandContour_Lvl', 'RoofStyle_Gambrel', 'RoofStyle_Mansard', 'Exterior1st_Stone', 'SaleType_ConLI', 'Neighborhood_MeadowV', 'Neighborhood_Timber', 'RoofStyle_Shed', 'Heating_Wall', 'Exterior1st_ImStucc', 'BldgType_Duplex', 'Neighborhood_Edwards', 'Exterior1st_HdBoard', 'Exterior2nd_CmentBd', 'Condition2_Feedr', 'Neighborhood_Veenker', 'RoofMatl_Metal', 'Neighborhood_Crawfor', 'Exterior1st_BrkFace', 'Exterior2nd_Stucco', 'RoofStyle_Gable', 'LotConfig_FR3', 'Heating_OthW', 'RoofMatl_Tar&Grv', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'HouseStyle_2.5Fin', 'Neighborhood_BrDale', 'Condition1_RRNn', 'Condition1_PosA', 'Neighborhood_Mitchel', 'HouseStyle_2Story', 'Heating_GasA', 'SaleType_CWD', 'Heating_Grav', 'Exterior2nd_BrkFace', 'SaleCondition_Family', 'Exterior2nd_AsphShn', 'LandContour_HLS', 'Neighborhood_Blueste', 'Exterior2nd_Other', 'Neighborhood_Sawyer', 'Heating_GasW', 'RoofStyle_Hip', 'Neighborhood_ClearCr', 'SaleType_ConLD', 'RoofMatl_Roll', 'Condition1_RRAn'}
In [196]:
ordinal_mappings = {
    'LotShape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
    'Utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
    'LandSlope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1}
}

ordinal_features = list(ordinal_mappings.keys())


for feature, mapping in ordinal_mappings.items():
    if feature in test.columns:
        test[feature] = test[feature].map(mapping)
In [197]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotFrontage    1459 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          1459 non-null   int64  
 7   LotShape       1459 non-null   int64  
 8   LandContour    1459 non-null   object 
 9   Utilities      1459 non-null   int64  
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   int64  
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallCond    1459 non-null   int64  
 19  YearBuilt      1459 non-null   int64  
 20  YearRemodAdd   1459 non-null   int64  
 21  RoofStyle      1459 non-null   object 
 22  RoofMatl       1459 non-null   object 
 23  Exterior1st    1459 non-null   object 
 24  Exterior2nd    1459 non-null   object 
 25  MasVnrType     1459 non-null   int64  
 26  MasVnrArea     1459 non-null   float64
 27  ExterQual      1459 non-null   int64  
 28  ExterCond      1459 non-null   int64  
 29  Foundation     1459 non-null   object 
 30  BsmtQual       1459 non-null   int64  
 31  BsmtCond       1459 non-null   int64  
 32  BsmtExposure   1459 non-null   int64  
 33  BsmtFinType1   1459 non-null   int64  
 34  BsmtFinSF1     1459 non-null   float64
 35  BsmtFinType2   1459 non-null   int64  
 36  BsmtFinSF2     1459 non-null   float64
 37  BsmtUnfSF      1459 non-null   float64
 38  TotalBsmtSF    1459 non-null   float64
 39  Heating        1459 non-null   object 
 40  HeatingQC      1459 non-null   int64  
 41  CentralAir     1459 non-null   int64  
 42  Electrical     1459 non-null   int64  
 43  1stFlrSF       1459 non-null   int64  
 44  2ndFlrSF       1459 non-null   int64  
 45  LowQualFinSF   1459 non-null   int64  
 46  GrLivArea      1459 non-null   int64  
 47  BsmtFullBath   1459 non-null   float64
 48  BsmtHalfBath   1459 non-null   float64
 49  FullBath       1459 non-null   int64  
 50  HalfBath       1459 non-null   int64  
 51  BedroomAbvGr   1459 non-null   int64  
 52  KitchenAbvGr   1459 non-null   int64  
 53  KitchenQual    1459 non-null   int64  
 54  TotRmsAbvGrd   1459 non-null   int64  
 55  Functional     1459 non-null   int64  
 56  Fireplaces     1459 non-null   int64  
 57  FireplaceQu    1459 non-null   int64  
 58  GarageType     1459 non-null   int64  
 59  GarageYrBlt    1459 non-null   float64
 60  GarageFinish   1459 non-null   int64  
 61  GarageCars     1459 non-null   float64
 62  GarageArea     1459 non-null   float64
 63  GarageQual     1459 non-null   int64  
 64  GarageCond     1459 non-null   int64  
 65  PavedDrive     1459 non-null   int64  
 66  WoodDeckSF     1459 non-null   int64  
 67  OpenPorchSF    1459 non-null   int64  
 68  EnclosedPorch  1459 non-null   int64  
 69  3SsnPorch      1459 non-null   int64  
 70  ScreenPorch    1459 non-null   int64  
 71  PoolArea       1459 non-null   int64  
 72  PoolQC         1459 non-null   int64  
 73  Fence          1459 non-null   int64  
 74  MiscFeature    1459 non-null   int64  
 75  MiscVal        1459 non-null   int64  
 76  MoSold         1459 non-null   int64  
 77  YrSold         1459 non-null   int64  
 78  SaleType       1459 non-null   object 
 79  SaleCondition  1459 non-null   object 
dtypes: float64(11), int64(52), object(17)
memory usage: 912.0+ KB
None
{'Exterior1st_AsphShn', 'MSZoning_RL', 'Exterior2nd_ImStucc', 'Condition1_RRAe', 'Condition1_RRNe', 'Neighborhood_SawyerW', 'MSZoning_RM', 'Condition1_Norm', 'HouseStyle_2.5Unf', 'MSZoning_RH', 'SaleType_New', 'SaleCondition_Normal', 'Neighborhood_NWAmes', 'Exterior1st_BrkComm', 'Neighborhood_BrkSide', 'Condition1_Feedr', 'Neighborhood_NPkVill', 'RoofMatl_CompShg', 'SaleType_Con', 'Exterior2nd_MetalSd', 'Foundation_CBlock', 'Foundation_Stone', 'Condition2_RRAe', 'Neighborhood_IDOTRR', 'Exterior1st_Plywood', 'Neighborhood_NridgHt', 'Exterior2nd_CBlock', 'MSZoning_FV', 'SaleCondition_AdjLand', 'Neighborhood_NoRidge', 'HouseStyle_1Story', 'Condition2_PosN', 'Neighborhood_Somerst', 'SaleCondition_Alloca', 'Exterior2nd_Plywood', 'Exterior1st_WdShing', 'Neighborhood_OldTown', 'Neighborhood_SWISU', 'Neighborhood_StoneBr', 'RoofMatl_Membran', 'Condition2_RRAn', 'Exterior1st_CBlock', 'Exterior1st_Wd Sdng', 'SaleType_Oth', 'Neighborhood_NAmes', 'RoofMatl_WdShngl', 'Exterior2nd_Stone', 'BldgType_2fmCon', 'Foundation_Wood', 'Foundation_PConc', 'LotConfig_CulDSac', 'LandContour_Low', 'Exterior2nd_HdBoard', 'Exterior2nd_VinylSd', 'Exterior2nd_Brk Cmn', 'Foundation_Slab', 'Exterior2nd_Wd Sdng', 'Street_Pave', 'SalePrice', 'HouseStyle_SLvl', 'RoofMatl_WdShake', 'SaleType_WD', 'HouseStyle_1.5Unf', 'Condition2_RRNn', 'Exterior1st_CemntBd', 'Neighborhood_Gilbert', 'Condition2_PosA', 'LotConfig_FR2', 'Condition2_Norm', 'BldgType_TwnhsE', 'HouseStyle_SFoyer', 'BldgType_Twnhs', 'SaleCondition_Partial', 'Exterior1st_MetalSd', 'SaleType_ConLw', 'Condition1_PosN', 'Neighborhood_CollgCr', 'Exterior2nd_Wd Shng', 'LotConfig_Inside', 'LandContour_Lvl', 'RoofStyle_Gambrel', 'RoofStyle_Mansard', 'Exterior1st_Stone', 'SaleType_ConLI', 'Neighborhood_MeadowV', 'Neighborhood_Timber', 'RoofStyle_Shed', 'Heating_Wall', 'Exterior1st_ImStucc', 'BldgType_Duplex', 'Neighborhood_Edwards', 'Exterior1st_HdBoard', 'Exterior2nd_CmentBd', 'Condition2_Feedr', 'Neighborhood_Veenker', 'RoofMatl_Metal', 'Neighborhood_Crawfor', 'Exterior1st_BrkFace', 'Exterior2nd_Stucco', 'RoofStyle_Gable', 'LotConfig_FR3', 'Heating_OthW', 'RoofMatl_Tar&Grv', 'Exterior1st_Stucco', 'Exterior1st_VinylSd', 'HouseStyle_2.5Fin', 'Neighborhood_BrDale', 'Condition1_RRNn', 'Condition1_PosA', 'Neighborhood_Mitchel', 'HouseStyle_2Story', 'Heating_GasA', 'SaleType_CWD', 'Heating_Grav', 'Exterior2nd_BrkFace', 'SaleCondition_Family', 'Exterior2nd_AsphShn', 'LandContour_HLS', 'Neighborhood_Blueste', 'Exterior2nd_Other', 'Neighborhood_Sawyer', 'Heating_GasW', 'RoofStyle_Hip', 'Neighborhood_ClearCr', 'SaleType_ConLD', 'RoofMatl_Roll', 'Condition1_RRAn'}
In [198]:
ohe_columns = [
    'MSZoning', 'Street', 'LandContour', 'LotConfig', 
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
    'Exterior2nd', 'Foundation', 'Heating', 'SaleType', 'SaleCondition'
]

test = pd.get_dummies(test, columns=ohe_columns, drop_first=True)
In [199]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
print(test.info())
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 176 entries, Id to SaleCondition_Partial
dtypes: float64(11), int64(52), uint8(113)
memory usage: 879.2 KB
None
{'RoofMatl_Roll', 'RoofMatl_Membran', 'Condition2_RRAe', 'Condition2_RRAn', 'Heating_OthW', 'Exterior2nd_Other', 'SalePrice', 'HouseStyle_2.5Fin', 'Heating_GasA', 'RoofMatl_CompShg', 'Exterior1st_Stone', 'Condition2_RRNn', 'RoofMatl_Metal', 'Exterior1st_ImStucc'}
In [200]:
missing_cols = set(train.columns) - set(test.columns)
for col in missing_cols:
    test[col] = 0

test = test[train.columns]
In [201]:
if 'SalePrice' in test.columns:
    test = test.drop(columns=['SalePrice'])
In [202]:
missing_data = test.isnull().sum().sort_values(ascending=False)
print(missing_data[missing_data > 0])
Series([], dtype: int64)
In [203]:
missing_cols = set(train.columns) - set(test.columns)
print(missing_cols)
{'SalePrice'}
In [204]:
print(test.info())
print(test.describe())
print(test.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 189 entries, Id to SaleCondition_Partial
dtypes: float64(11), int64(65), uint8(113)
memory usage: 1.0 MB
None
                Id   MSSubClass  LotFrontage       LotArea        Alley  \
count  1459.000000  1459.000000  1459.000000   1459.000000  1459.000000   
mean   2190.000000    57.378341    68.955106   9819.161069     0.098698   
std     421.321334    42.746880    20.999091   4955.517327     0.373861   
min    1461.000000    20.000000    21.000000   1470.000000     0.000000   
25%    1825.500000    20.000000    60.000000   7391.000000     0.000000   
50%    2190.000000    50.000000    70.000000   9399.000000     0.000000   
75%    2554.500000    70.000000    80.000000  11517.500000     0.000000   
max    2919.000000   190.000000   200.000000  56600.000000     2.000000   

          LotShape  Utilities    LandSlope  OverallQual  OverallCond  ...  \
count  1459.000000     1459.0  1459.000000  1459.000000  1459.000000  ...   
mean      2.607951        3.0     1.954764     6.078821     5.553804  ...   
std       0.557864        0.0     0.217566     1.436812     1.113740  ...   
min       0.000000        3.0     0.000000     1.000000     1.000000  ...   
25%       2.000000        3.0     2.000000     5.000000     5.000000  ...   
50%       3.000000        3.0     2.000000     6.000000     5.000000  ...   
75%       3.000000        3.0     2.000000     7.000000     6.000000  ...   
max       3.000000        3.0     2.000000    10.000000     9.000000  ...   

       SaleType_ConLI  SaleType_ConLw  SaleType_New  SaleType_Oth  \
count     1459.000000     1459.000000   1459.000000   1459.000000   
mean         0.002742        0.002056      0.080192      0.002742   
std          0.052306        0.045314      0.271683      0.052306   
min          0.000000        0.000000      0.000000      0.000000   
25%          0.000000        0.000000      0.000000      0.000000   
50%          0.000000        0.000000      0.000000      0.000000   
75%          0.000000        0.000000      0.000000      0.000000   
max          1.000000        1.000000      1.000000      1.000000   

       SaleType_WD  SaleCondition_AdjLand  SaleCondition_Alloca  \
count   1459.00000            1459.000000           1459.000000   
mean       0.86292               0.005483              0.008225   
std        0.34405               0.073871              0.090348   
min        0.00000               0.000000              0.000000   
25%        1.00000               0.000000              0.000000   
50%        1.00000               0.000000              0.000000   
75%        1.00000               0.000000              0.000000   
max        1.00000               1.000000              1.000000   

       SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  
count           1459.000000           1459.000000            1459.000000  
mean               0.017820              0.825223               0.082248  
std                0.132344              0.379907               0.274837  
min                0.000000              0.000000               0.000000  
25%                0.000000              1.000000               0.000000  
50%                0.000000              1.000000               0.000000  
75%                0.000000              1.000000               0.000000  
max                1.000000              1.000000               1.000000  

[8 rows x 189 columns]
     Id  MSSubClass  LotFrontage  LotArea  Alley  LotShape  Utilities  \
0  1461          20         80.0    11622      0         3          3   
1  1462          20         81.0    14267      0         2          3   
2  1463          60         74.0    13830      0         2          3   
3  1464          60         78.0     9978      0         2          3   
4  1465         120         43.0     5005      0         2          3   

   LandSlope  OverallQual  OverallCond  ...  SaleType_ConLI  SaleType_ConLw  \
0          2            5            6  ...               0               0   
1          2            6            6  ...               0               0   
2          2            5            5  ...               0               0   
3          2            6            6  ...               0               0   
4          2            8            5  ...               0               0   

   SaleType_New  SaleType_Oth  SaleType_WD  SaleCondition_AdjLand  \
0             0             0            1                      0   
1             0             0            1                      0   
2             0             0            1                      0   
3             0             0            1                      0   
4             0             0            1                      0   

   SaleCondition_Alloca  SaleCondition_Family  SaleCondition_Normal  \
0                     0                     0                     1   
1                     0                     0                     1   
2                     0                     0                     1   
3                     0                     0                     1   
4                     0                     0                     1   

   SaleCondition_Partial  
0                      0  
1                      0  
2                      0  
3                      0  
4                      0  

[5 rows x 189 columns]

Save Clean Test Data¶

In [205]:
test.to_csv('../data/clean_test.csv', index=False)
print("Cleaned test dataset saved as clean_test.csv.")
Cleaned test dataset saved as clean_test.csv.